introduction%201.png

introduction%202.png

introduction%203.png

Exploratory analysis of monkeypox data using Plotly, Pandas, Wordcloud, etc.¶

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
In [2]:
# Line list from the globaldothealth/monkeypox GitHub repository; it is
# downloaded again every time this cell runs.
DATA_URL = "https://raw.githubusercontent.com/globaldothealth/monkeypox/main/latest.csv"
data = pd.read_csv(DATA_URL, low_memory = False)

# (rows, columns) of the loaded frame.
print(data.shape)
(39001, 36)
In [3]:
# Preview the first rows to see which columns the line list provides.
data.head()
Out[3]:
ID Status Location City Country Country_ISO3 Age Gender Date_onset Date_confirmation ... Source Source_II Source_III Source_IV Source_V Source_VI Source_VII Date_entry Date_death Date_last_modified
0 N1 confirmed Guy's and St Thomas Hospital London London England GBR NaN NaN 2022-04-29 2022-05-06 ... https://www.gov.uk/government/news/monkeypox-c... https://www.who.int/emergencies/disease-outbre... NaN NaN NaN NaN NaN 2022-05-18 NaN 2022-05-18
1 N2 confirmed Guy's and St Thomas Hospital London London England GBR NaN NaN 2022-05-05 2022-05-12 ... https://www.gov.uk/government/news/monkeypox-c... NaN NaN NaN NaN NaN NaN 2022-05-18 NaN 2022-05-18
2 N3 confirmed London London England GBR NaN NaN 2022-04-30 2022-05-13 ... https://www.gov.uk/government/news/monkeypox-c... NaN NaN NaN NaN NaN NaN 2022-05-18 NaN 2022-05-18
3 N4 confirmed London London England GBR NaN male NaN 2022-05-15 ... https://www.gov.uk/government/news/monkeypox-c... NaN NaN NaN NaN NaN NaN 2022-05-18 NaN 2022-05-18
4 N5 confirmed London London England GBR NaN male NaN 2022-05-15 ... https://www.gov.uk/government/news/monkeypox-c... NaN NaN NaN NaN NaN NaN 2022-05-18 NaN 2022-05-18

5 rows × 36 columns

Missingness of the data¶

  • data.isna().sum(): count the number of missing observations per column in the dataset
In [4]:
# Count the NaN values in every column of `data` (one row per column).
n_rows = data.shape[0]
missing_df = data.isna().sum().rename_axis('Column').reset_index(name = 'NaN_Count')
missing_df['NaN_%'] = (missing_df['NaN_Count'] / n_rows * 100).round(1)
missing_df['Type'] = 'Missingness'

# Completeness is the complement of missingness. It is built in one vectorized
# step instead of appending one row per loop iteration, and the percentage keeps
# its decimal so that the two shares of a column always add up to 100.
complete_df = pd.DataFrame({
    'Column': missing_df['Column'],
    'NaN_Count': n_rows - missing_df['NaN_Count'],
    'NaN_%': (100 - missing_df['NaN_%']).round(1),
    'Type': 'Completeness',
})

# Missingness rows come first, ordered by their NaN share, because the order of
# first appearance is what the bar chart uses for the x axis.
nan_df = pd.concat([missing_df.sort_values('NaN_%'), complete_df])
nan_df = nan_df[['Column', 'NaN_%', 'NaN_Count', 'Type']]

nan_df.head()
Out[4]:
Column NaN_% NaN_Count Type
0 ID 0.0 0 Missingness
33 Date_entry 0.0 0 Missingness
26 Source 0.0 0 Missingness
5 Country_ISO3 0.0 0 Missingness
35 Date_last_modified 0.0 0 Missingness
In [5]:
# Missingness plot: one bar per dataset column, coloured by `Type`
# (Missingness vs. Completeness).
fig = px.bar(
    nan_df,
    x = 'Column',
    y = 'NaN_%',
    color = 'Type',
    color_discrete_sequence = ['#dbdbdb','#38cae0'],
    opacity = 0.6,
    title = 'Missingness within this Dataset',
    template = 'plotly_dark',
    width = 800,
    height = 450,
)
fig.update_xaxes(title = 'Column Name')
fig.update_yaxes(title = 'Percentage of NaNs')

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# NOTE(review): presumably kept so the cell does not end on an expression that
# returns the figure and renders it twice - confirm before removing.
plt.show(block = False)
In [6]:
# Render the missingness bar chart assembled in the previous cell.
fig.show()

Geographical distribution of confirmed cases¶

  • confirmed_data[['Country_ISO3','ID']].groupby('Country_ISO3').agg('count').reset_index() : count the number of confirmed cases by country
In [7]:
# Keep only the rows whose Status is "confirmed"; later cells reuse `confirmed_data`.
confirmed_data = data[data["Status"] == "confirmed"]

# Number of confirmed IDs per Country_ISO3 (three-letter country code).
data_df = (
    confirmed_data[['Country_ISO3','ID']]
    .groupby('Country_ISO3')
    .agg('count')
    .reset_index()
    .rename(columns = {'ID':'Total'})
)
data_df.head()
Out[7]:
Country_ISO3 Total
0 AND 4
1 ARE 16
2 ARG 49
3 AUS 71
4 AUT 198
In [8]:
# World choropleth of confirmed cases per country, drawn on a globe.
# The subtitle now names the projection that is actually requested below
# ('orthographic'); it previously read "Natural Projection".
fig = px.choropleth(data_df,
                    locations = "Country_ISO3",
                    color = "Total",
                    hover_name = "Country_ISO3",
                    color_continuous_scale = "peach",
                    projection = 'orthographic',
                    template = 'plotly_dark',
                    title = 'Geographical Distribution of Confirmed Monkeypox Cases<br><sub>Orthographic Projection</sub>',
                    height = 450,
                    width = 800,
                   )

# Country borders plus a dark latitude/longitude grid, set in a single call.
fig.update_geos(showcountries = True,
                lataxis = {'showgrid': True, 'gridcolor':'#222222'},
                lonaxis = {'showgrid': True, 'gridcolor':'#222222'},
               )

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# NOTE(review): presumably kept so the cell does not end on an expression that
# returns the figure and renders it twice - confirm before removing.
plt.show(block = False)
In [9]:
# Render the world choropleth assembled in the previous cell.
fig.show()
In [10]:
# Bubble map: one marker per country, with both size and colour driven by `Total`.
fig = px.scatter_geo(
    data_df,
    locations = "Country_ISO3",
    hover_name = "Country_ISO3",
    size = "Total",
    color = "Total",
    color_continuous_scale = ['#06FF00','#FFE400','#FF8E00','#FF1700'],
    projection = "natural earth",
    title = 'Geographical Distribution of Confirmed Monkeypox Cases using Bubble Maps',
    template = 'plotly_dark',
    width = 800,
    height = 450,
)

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
plt.show(block = False)
In [11]:
# Render the bubble map assembled in the previous cell.
fig.show()
In [12]:
# Same per-country totals as the world choropleth, restricted to scope = 'europe'.
fig = px.choropleth(
    data_df,
    locations = "Country_ISO3",
    color = "Total",
    hover_name = "Country_ISO3",
    color_continuous_scale = "peach",
    scope = 'europe',
    title = 'Distribution of Confirmed Monkeypox Cases<br><sub>European </sub>',
    template = 'plotly_dark',
    width = 800,
    height = 450,
)

# Borders, sub-units and a dark latitude/longitude grid, set in a single call.
fig.update_geos(
    showcountries = True,
    showsubunits = True,
    lataxis = {'showgrid': True, 'gridcolor':'#222222'},
    lonaxis = {'showgrid': True, 'gridcolor':'#222222'},
)

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# NOTE(review): presumably kept so the cell does not end on an expression that
# returns the figure and renders it twice - confirm before removing.
plt.show(block = False)
In [13]:
# Render the European choropleth assembled in the previous cell.
fig.show()

How the number of confirmed cases changes over time¶

  • data[['Date_confirmation','ID']].groupby('Date_confirmation').agg('count').reset_index(): count the number of confirmed cases by date
In [14]:
# Daily counts: number of IDs recorded for each Date_confirmation value.
# NOTE(review): this starts from `data`, not `confirmed_data`, so rows of any
# Status that carry a confirmation date are counted - confirm that is intended.
acumulated_df = (
    data[['Date_confirmation','ID']]
    .groupby('Date_confirmation')
    .agg('count')
    .reset_index()
    .rename(columns = {'ID':'Count'})
)

# Running total of the daily counts, in the row order produced by the groupby.
acumulated_df['Accumulated Cases'] = acumulated_df['Count'].cumsum()
acumulated_df.head()
Out[14]:
Date_confirmation Count Accumulated Cases
0 2022-01-31 2 2
1 2022-02-17 3 5
2 2022-02-28 1 6
3 2022-03-04 2 8
4 2022-03-31 6 14
In [15]:
# Mean of the per-date counts; drawn below as a dotted reference line.
avg_returns = acumulated_df['Count'].mean()
avg_returns_color = '#bbbbbb'

# One bar per confirmation date; bar colour and text label both follow `Count`.
fig = px.bar(
    acumulated_df,
    x = 'Date_confirmation',
    y = 'Count',
    text = 'Count',
    color = 'Count',
    color_continuous_scale = ['#bbbbbb','#38cae0'],
    color_continuous_midpoint = 1000,
    hover_data = ['Count'],  # adds the Count column to the hover tooltip
    opacity = 0.9,
    title = 'Number of Confirmed Cases by Date',
    template = 'plotly_dark',
    width = 800,
    height = 450,
)

fig.add_hline(
    y = avg_returns,
    line_color = avg_returns_color,
    line_width = 1.,
    line_dash = "dot",
    opacity = 0.7,
    fillcolor = avg_returns_color,
    annotation_text = f"Average: {round(avg_returns,1)} cases per day",
    annotation_position = "bottom right",
    annotation_font_size = 10,
    annotation_font_color = "white",
)

# Monthly ticks formatted like "May 2022", plus the axis titles.
fig.update_xaxes(
    title = 'Date',
    showticklabels = True,
    dtick = "M1",
    tickformat = "%b %Y",
)
fig.update_yaxes(title = 'Confirmed Cases')

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# NOTE(review): presumably kept so the cell does not end on an expression that
# returns the figure and renders it twice - confirm before removing.
plt.show(block = False)
In [16]:
# Render the cases-by-date bar chart assembled in the previous cell.
fig.show()

Number of confirmed cases by day of the week¶

  • .reindex(cats): reorder the counts so they follow the weekday order listed in cats
  • .to_datetime(): convert argument to datetime
  • .day_name(): return the day names
In [18]:
# Weekday labels in calendar order; value_counts() would otherwise order them by frequency.
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Confirmed cases per weekday name of the confirmation date, reordered to follow `cats`.
data_sort = (
    pd.to_datetime(confirmed_data['Date_confirmation'])
    .dt.day_name()
    .value_counts()
    .reindex(cats)
)
data_sort.head()
Out[18]:
Monday       7459
Tuesday      7932
Wednesday    5021
Thursday     6701
Friday       7328
Name: Date_confirmation, dtype: int64
In [19]:
# Bar chart of the weekday counts held in `data_sort` (a Series indexed by day name).
# NOTE(review): the continuous colour settings appear to have no effect here
# because no numeric `color` column is mapped - confirm before removing them.
fig = px.bar(data_sort,
             opacity = 0.9,
             title = 'Number of Confirmed Cases by Days',
             color_continuous_scale = ['#bbbbbb','#38cae0'],
             template = 'plotly_dark',
             color_continuous_midpoint = 0,
             height = 450,
             width = 800,
            )

# Explicit axis titles, as on the other charts, instead of the labels that
# Plotly Express derives from the Series.
fig.update_yaxes(title = 'Confirmed Cases')
fig.update_xaxes(title = 'Day of the Week')

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# It also keeps the cell from ending on an expression that returns the figure.
plt.show(block = False)
In [20]:
# Render the weekday bar chart assembled in the previous cell.
fig.show()

Identify the most common symptoms associated with monkeypox¶

In [21]:
# Replace NaN by "" in every column of `data`, in place. The later cells test
# `!= ""` on Symptoms and Gender and therefore rely on this. Because the frame
# itself is modified, re-running the missingness cells after this one will no
# longer find any NaN values.
data.fillna("",inplace=True) # replace NaN by ""
def function(train):
    """Flatten an iterable of values into one lower-cased text blob.

    Each entry is converted with ``str()``, split on whitespace, lower-cased
    token by token and re-joined with single spaces. Every entry is followed by
    one trailing space, so an empty entry still contributes a single space.

    Parameters
    ----------
    train : iterable
        Values to flatten (for example a pandas Series of symptom strings).

    Returns
    -------
    str
        The concatenation of all normalised entries; ``""`` for empty input.
    """
    pieces = []
    for entry in train:
        lowered = [token.lower() for token in str(entry).split()]
        pieces.append(" ".join(lowered) + " ")
    return "".join(pieces)

def plot_wordcloud(data):
    """Draw a word cloud of the text contained in ``data`` on a new matplotlib figure.

    Parameters
    ----------
    data : iterable
        Values whose words should be plotted; they are flattened into a single
        lower-cased string by ``function`` first.

    Returns
    -------
    None
        The image is drawn on the current matplotlib figure as a side effect.
    """
    comment_words = function(data)

    # The stop-word set used to be built but never handed to WordCloud. It is
    # now passed explicitly so the filtering no longer depends on the library default.
    # WordCloud and STOPWORDS come from the notebook's import cell.
    wordcloud = WordCloud(width = 800,
                          height = 450,
                          contour_color='#023075',
                          background_color ='black',
                          colormap='autumn',
                          min_font_size = 20,
                          stopwords = set(STOPWORDS),
                          collocations=False).generate(comment_words)

    # Plot the WordCloud image without axes or padding.
    plt.figure(figsize = (16, 9),
               facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)
In [22]:
# Word cloud of the free-text Symptoms column (missing values were replaced by "" above).
plot_wordcloud(data['Symptoms'])
In [24]:
# Frequency of every distinct, non-empty Symptoms string.
symptom_counts = data.loc[data['Symptoms'] != "", 'Symptoms'].value_counts()

# Strings reported more than once get their own slice. The columns are named
# explicitly ('index' = label, 'Symptoms' = count) so the result does not depend
# on how value_counts().reset_index() names its columns, which differs between
# pandas versions.
frequent_counts = symptom_counts[symptom_counts > 1]
temp_df = pd.DataFrame({'index': frequent_counts.index,
                        'Symptoms': frequent_counts.to_numpy()})

# Strings reported exactly once are pooled into a single 'Other' slice. The
# slice is kept whenever at least one such string exists (previously a lone
# singleton was dropped by the `> 1` filter).
other_total = int(symptom_counts[symptom_counts < 2].sum())
if other_total > 0:
    other_row = pd.DataFrame({'index': ['Other'], 'Symptoms': [other_total]})
    temp_df = pd.concat([temp_df, other_row], ignore_index = True)


# Donut chart (hole = .75) with the word 'Symptoms' centred in the hole.
fig = go.Figure(data = [go.Pie(labels = temp_df['index'],
                               values = temp_df['Symptoms'],
                               hole = .75,
                               marker_colors =px.colors.sequential.Agsunset,
                              )])
fig.update_layout(
    title_text = "Majorly affecting symptoms",
    template = 'plotly_dark',
    width = 800,
    height = 450,
    annotations = [dict(text = 'Symptoms',
                      x = 0.5,
                      y = 0.5,
                      font_size = 20,
                      showarrow = False
                     )])

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# It also keeps the cell from ending on an expression that returns the figure.
plt.show(block = False)
In [25]:
# Render the symptoms donut chart assembled in the previous cell.
fig.show()

Are patient characteristics (e.g., gender) associated with monkeypox?¶

In [26]:
# Harmonise the spellings of the gender labels (lower-case and trailing-space
# variants) directly in `data`, exactly as the four separate assignments did.
gender_labels = {'male':'Male', 'male ':'Male', 'female':'Female', 'female ':'Female'}
data['Gender'] = data['Gender'].replace(gender_labels)

# Confirmed cases with a non-empty Gender value.
filtered_df = data.loc[data['Status'] == 'confirmed']
filtered_df = filtered_df.loc[filtered_df['Gender'] != ""]

# Count and percentage share per gender label.
temp_df = filtered_df[['Gender','ID']].groupby('Gender').agg('count').reset_index()
temp_df = temp_df.rename(columns = {'ID':'Count'})
temp_df['Percentage'] = (temp_df['Count']/temp_df['Count'].sum() * 100).round(2)
temp_df['Percentage'] = temp_df['Percentage'].astype('str') + '%'

# Sample size used for the chart vs. the number of rows in the whole dataset.
n = temp_df['Count'].sum()
n_total = data.shape[0]

# Horizontal bars: the categories are on the y axis and the counts on the x axis.
fig = px.bar(temp_df,
             y = 'Gender',
             x = 'Count',
             title = 'Distribution of Sex Among Confirmed Cases<br><sub>Calculated on a sample of {} out of {} observations in the dataset</sub>'.format(n,n_total),
             color = 'Gender',
             text = 'Percentage',
             template = 'plotly_dark',
             opacity = 0.8,
             height = 450,
             width = 800,
             color_discrete_sequence = ['#dbdbdb','#38cae0']
            )

# The axis titles were swapped: y carries the Gender categories, x the counts.
fig.update_yaxes(title = 'Sex')
fig.update_xaxes(title = 'Count of Occurences')

# pyplot call: it does not draw the Plotly figure (fig.show() in the next cell does).
# It also keeps the cell from ending on an expression that returns the figure.
plt.show(block = False)
In [27]:
# Render the gender distribution bar chart assembled in the previous cell.
fig.show()